import pandas as pd
import plotly.express as px
!pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
Requirement already satisfied: scikit-learn in c:\users\ealija\anaconda3\lib\site-packages (1.0.2) Requirement already satisfied: scipy>=1.1.0 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.7.3) Requirement already satisfied: numpy>=1.14.6 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.21.5) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (2.2.0) Requirement already satisfied: joblib>=0.11 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.1.0)
# Read the movie data set from its CSV file into the working DataFrame `df`.
# The location is a hard-coded absolute path on the author's machine.
csv_path = "C:/Users/Ealija/Downloads/IMBD_TOP_1000.csv"
df = pd.read_csv(csv_path)
# Pie chart to show the distribution of movies by genre.
# The raw "Genre" column holds labels that print identically but compare as
# different values (the grouped output further down lists "Comedy", "Drama" and
# "Horror" twice each), which would produce duplicate pie slices. Stripping
# surrounding whitespace first merges those variants into a single label.
genre_labels = df["Genre"].str.strip()
genre_count = genre_labels.value_counts().reset_index()
# Overwrite both column names so the frame is ["Genre", "Count"] regardless of
# what names reset_index() produced.
genre_count.columns = ["Genre", "Count"]
fig = px.pie(genre_count, values="Count", names="Genre", title="Distribution of Movies by Genre")
fig.show()
# "Gross" is read as text with thousands separators (e.g. "28,341,469");
# drop the commas literally, then convert the column to floats in place.
gross_without_commas = df["Gross"].str.replace(",", "", regex=False)
df["Gross"] = gross_without_commas.astype(float)
# Runtime against gross earnings, one point per movie, with an
# ordinary-least-squares trend line drawn over the points.
fig = px.scatter(
    df,
    x="Runtime (min)",
    y="Gross",
    trendline="ols",
    title="Relationship Between Runtime and Gross",
)
fig.show()
# Bar chart of summed gross per director, limited to the ten largest totals.
gross_by_director = df.groupby("Director", as_index=False)["Gross"].sum()
# Order directors from highest to lowest total and keep the first ten rows.
director_gross = gross_by_director.sort_values(by="Gross", ascending=False).head(10)
fig = px.bar(
    director_gross,
    x="Director",
    y="Gross",
    title="Total Gross by Director",
)
fig.show()
# Line chart of the total gross of all listed movies per release year.
# NOTE(review): assumes "Released_Year" holds clean year values that sort
# chronologically — confirm the column's dtype and contents.
movies_by_year = df.groupby("Released_Year", as_index=False)
gross_year = movies_by_year["Gross"].sum()
fig = px.line(
    gross_year,
    x="Released_Year",
    y="Gross",
    title="Trend of Gross over the Years",
)
fig.show()
# Scatter plot of runtime against Meta_score, one point per movie.
fig = px.scatter(
    df,
    x="Runtime (min)",
    y="Meta_score",
    title="Relationship Between Runtime and Meta_score",
)
fig.show()
# One box per certificate category, summarising the spread of IMDB ratings
# among the movies holding that certificate.
fig = px.box(
    df,
    x="Certificate",
    y="IMDB_Rating",
    title="Distribution of IMDB Ratings by Certificate",
)
fig.show()
# Group the data by genre and calculate the total gross and average runtime for each genre.
# Grouping on the raw "Genre" column yields several rows that print with the
# same label ("Comedy", "Drama" and "Horror" each appear twice in the output),
# i.e. the stored labels differ only by invisible characters. Grouping on the
# whitespace-stripped label merges those variants into one row per genre.
genre_key = df["Genre"].str.strip()
genre_stats = (
    df.groupby(genre_key)
    .agg({"Gross": "sum", "Runtime (min)": "mean"})
    .reset_index()
)
# Group the data by director and calculate the average IMDB rating for each director
director_rating = df.groupby("Director").agg({"IMDB_Rating": "mean"}).reset_index()
# Display the resulting data frames
print(genre_stats)
print(director_rating)
Genre Gross Runtime (min)
0 Action 2.212587e+10 129.046512
1 Adventure 6.022137e+09 134.111111
2 Animation 9.594346e+09 99.585366
3 Biography 5.362483e+09 136.022727
4 Comedy 6.200388e+08 95.000000
5 Comedy 5.381711e+09 113.697183
6 Crime 4.132271e+09 126.392523
7 Drama 2.701246e+09 127.211765
8 Drama 1.009115e+10 123.705882
9 Family 4.391106e+08 107.500000
10 Fantasy 1.360695e+08 85.000000
11 Film-Noir 7.059200e+07 104.000000
12 Horror 2.355221e+08 111.000000
13 Horror 5.683703e+08 100.111111
14 Mystery 4.780601e+08 119.083333
15 Thriller 1.755074e+07 108.000000
16 Western 5.822151e+07 148.250000
Director IMDB_Rating
0 Aamir Khan 8.40
1 Aaron Sorkin 7.80
2 Abdellatif Kechiche 7.70
3 Abhishek Chaubey 7.80
4 Abhishek Kapoor 7.70
.. ... ...
543 Zack Snyder 7.60
544 Zaza Urushadze 8.20
545 Zoya Akhtar 8.05
546 Çagan Irmak 8.30
547 Ömer Faruk Sorak 8.00
[548 rows x 2 columns]
# Narrow the data to the title, rating and gross columns.
summary_columns = ["Series_Title", "IMDB_Rating", "Gross"]
df1 = df[summary_columns]
# The ten rows of df1 with the largest "Gross" values.
df2 = df1.nlargest(n=10, columns="Gross")
# Left join on the title: every df1 row is kept. Both frames share the
# "IMDB_Rating" and "Gross" columns, so pandas suffixes them "_x" (from df1)
# and "_y" (from df2); the "_y" columns are NaN for titles absent from df2.
merged_df = df1.merge(df2, how="left", on="Series_Title")
print(merged_df.head())
Series_Title IMDB_Rating_x Gross_x IMDB_Rating_y \
0 The Shawshank Redemption 9.3 28341469.0 NaN
1 The Godfather 9.2 134966411.0 NaN
2 The Dark Knight 9.0 534858444.0 9.0
3 The Godfather: Part II 9.0 57300000.0 NaN
4 12 Angry Men 9.0 4360000.0 NaN
Gross_y
0 NaN
1 NaN
2 534858444.0
3 NaN
4 NaN
# Starting from the dataframe read from the CSV file, the code above creates a new
# dataframe with only the "Series_Title", "IMDB_Rating", and "Gross" columns. It then
# creates a second dataframe holding the 10 highest-grossing movies, ranked by the
# "Gross" column. Finally, it performs a left join of the two dataframes on the
# "Series_Title" column: every row of the first dataframe is kept, and the columns that
# come from the top-10 dataframe are NaN for any movie that is not in the top 10.
# The join marks which movies are top grossers, so a cinema can compare the IMDB ratings
# of the highest-grossing titles with those of the rest of the list.
# Replace missing "Gross" values with the column mean.
# The filled column is assigned back to df rather than calling
# fillna(..., inplace=True) on the df["Gross"] selection: that form is chained
# assignment, which newer pandas versions warn about and which leaves df
# unchanged when Copy-on-Write is enabled.
mean_gross = df["Gross"].mean()
df["Gross"] = df["Gross"].fillna(mean_gross)
# Replace missing "Meta_score" values with the column mean, same pattern.
df["Meta_score"] = df["Meta_score"].fillna(df["Meta_score"].mean())
# NOTE(review): "Gross" is used below as the regression target and both means
# are computed over the full data set before the train/test split — consider
# whether imputed targets and pre-split statistics are acceptable here.
# Predictors (runtime, IMDB rating, Meta score) and the target (gross earnings).
feature_columns = ["Runtime (min)", "IMDB_Rating", "Meta_score"]
X = df[feature_columns]
y = df["Gross"]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Fit an ordinary linear regression on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
from sklearn.metrics import mean_squared_error

# Predict gross for the held-out rows and score the predictions against the
# true values with the mean squared error (in squared units of "Gross").
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean squared error:", mse)
Mean squared error: 1.0227470671185832e+16